R Markdown
setwd("/Users/kenyaroy/doing data science/unit 8")
Beers = read.csv("/Users/kenyaroy/doing data science/unit 8/Beers.csv")
Breweries = read.csv("/Users/kenyaroy/doing data science/unit 8/Breweries.csv")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(naniar)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(usmap)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ purrr 1.0.1
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ plotly::filter() masks dplyr::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(class)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(e1071)
#How many breweries are present in each state
Breweries %>% group_by(State) %>% dplyr::summarize(n()) %>% print(n=51)
## # A tibble: 51 × 2
## State `n()`
## <chr> <int>
## 1 " AK" 7
## 2 " AL" 3
## 3 " AR" 2
## 4 " AZ" 11
## 5 " CA" 39
## 6 " CO" 47
## 7 " CT" 8
## 8 " DC" 1
## 9 " DE" 2
## 10 " FL" 15
## 11 " GA" 7
## 12 " HI" 4
## 13 " IA" 5
## 14 " ID" 5
## 15 " IL" 18
## 16 " IN" 22
## 17 " KS" 3
## 18 " KY" 4
## 19 " LA" 5
## 20 " MA" 23
## 21 " MD" 7
## 22 " ME" 9
## 23 " MI" 32
## 24 " MN" 12
## 25 " MO" 9
## 26 " MS" 2
## 27 " MT" 9
## 28 " NC" 19
## 29 " ND" 1
## 30 " NE" 5
## 31 " NH" 3
## 32 " NJ" 3
## 33 " NM" 4
## 34 " NV" 2
## 35 " NY" 16
## 36 " OH" 15
## 37 " OK" 6
## 38 " OR" 29
## 39 " PA" 25
## 40 " RI" 5
## 41 " SC" 4
## 42 " SD" 1
## 43 " TN" 3
## 44 " TX" 28
## 45 " UT" 4
## 46 " VA" 16
## 47 " VT" 10
## 48 " WA" 23
## 49 " WI" 20
## 50 " WV" 1
## 51 " WY" 4
Breweries %>% ggplot(aes(x = State, fill = State)) + geom_bar() + ggtitle("Distribution of Breweries by State") + ylab(" # of Breweries") + geom_text(stat = "count", aes(label = after_stat(count)), vjust = 0) + theme(legend.position = "none")

#Here, we found the number of breweries per state. The numbers are listed above the bars for each corresponding state.
#Heat Map of breweries in each state.
library(usmap)
StateBeerC = data.frame(state = c("AK","AL","AR","AZ","CA","CO","CT","DC","DE","FL","GA","HI","IA","ID","IL","IN","KS","KY","LA","MA","MD","ME","MI","MN","MO","MS","MT","NC","ND","NE","NH","NJ","NM","NV","NY","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VA","VT","WA","WI","WV","WY"),values = c(7,3,2,11,39,47,8,1,2,15,7,4,5,5,18,22,3,4,5,23,7,9,32,12,9,2,9,19,1,5,3,3,4,2,16,15,6,29,25,5,4,1,3,28,4,16,10,23,20,1,4))
plot_usmap(data = StateBeerC, values = "values", regions = "state") + scale_fill_continuous(low = "yellow", high = "red", name = "Number of Breweries", label = scales::comma) + labs(title = "Number of Breweries By State", ) + theme(legend.position = "right")

#Using the heatmap, you can see the location of states with greater numbers of breweries (including Colorado, California, Michigan, Texas, Pennsylvania, etc.)
#Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the #merged file.
Breweries$Brewery_id = Breweries$Brew_ID
Breweries <- Breweries %>% select(Brewery_id, Name, City, State)
BB <- merge(Beers,Breweries, by = "Brewery_id", all = TRUE)
dim(BB)
## [1] 2410 10
summary(BB)
## Brewery_id Name.x Beer_ID ABV
## Min. : 1.0 Length:2410 Min. : 1.0 Min. :0.00100
## 1st Qu.: 94.0 Class :character 1st Qu.: 808.2 1st Qu.:0.05000
## Median :206.0 Mode :character Median :1453.5 Median :0.05600
## Mean :232.7 Mean :1431.1 Mean :0.05977
## 3rd Qu.:367.0 3rd Qu.:2075.8 3rd Qu.:0.06700
## Max. :558.0 Max. :2692.0 Max. :0.12800
## NA's :62
## IBU Style Ounces Name.y
## Min. : 4.00 Length:2410 Min. : 8.40 Length:2410
## 1st Qu.: 21.00 Class :character 1st Qu.:12.00 Class :character
## Median : 35.00 Mode :character Median :12.00 Mode :character
## Mean : 42.71 Mean :13.59
## 3rd Qu.: 64.00 3rd Qu.:16.00
## Max. :138.00 Max. :32.00
## NA's :1005
## City State
## Length:2410 Length:2410
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
head(BB)
## Brewery_id Name.x Beer_ID ABV IBU
## 1 1 Get Together 2692 0.045 50
## 2 1 Maggie's Leap 2691 0.049 26
## 3 1 Wall's End 2690 0.048 19
## 4 1 Pumpion 2689 0.060 38
## 5 1 Stronghold 2688 0.060 25
## 6 1 Parapet ESB 2687 0.056 47
## Style Ounces Name.y City
## 1 American IPA 16 NorthGate Brewing Minneapolis
## 2 Milk / Sweet Stout 16 NorthGate Brewing Minneapolis
## 3 English Brown Ale 16 NorthGate Brewing Minneapolis
## 4 Pumpkin Ale 16 NorthGate Brewing Minneapolis
## 5 American Porter 16 NorthGate Brewing Minneapolis
## 6 Extra Special / Strong Bitter (ESB) 16 NorthGate Brewing Minneapolis
## State
## 1 MN
## 2 MN
## 3 MN
## 4 MN
## 5 MN
## 6 MN
tail(BB)
## Brewery_id Name.x Beer_ID ABV IBU
## 2405 556 Pilsner Ukiah 98 0.055 NA
## 2406 557 Heinnieweisse Weissebier 52 0.049 NA
## 2407 557 Snapperhead IPA 51 0.068 NA
## 2408 557 Moo Thunder Stout 50 0.049 NA
## 2409 557 Porkslap Pale Ale 49 0.043 NA
## 2410 558 Urban Wilderness Pale Ale 30 0.049 NA
## Style Ounces Name.y City
## 2405 German Pilsener 12 Ukiah Brewing Company Ukiah
## 2406 Hefeweizen 12 Butternuts Beer and Ale Garrattsville
## 2407 American IPA 12 Butternuts Beer and Ale Garrattsville
## 2408 Milk / Sweet Stout 12 Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA) 12 Butternuts Beer and Ale Garrattsville
## 2410 English Pale Ale 12 Sleeping Lady Brewing Company Anchorage
## State
## 2405 CA
## 2406 NY
## 2407 NY
## 2408 NY
## 2409 NY
## 2410 AK
#Here you can see the first and last 6 observations in the merged beers and breweries file.
#Address the missing values in each column
#https://www.masterclass.com/articles/ibu-beer
#There are styles of beer and each style has a range of IBU, we could use the sytle of the beer to assign an average value
MeanIBU <- BB %>% filter(!is.na(IBU)) %>% group_by(Style) %>% dplyr::summarize(IBUMean = mean(IBU))
TestBB <- merge(BB,MeanIBU, by="Style")
TestBB$IBU[is.na(TestBB$IBU)] <- TestBB$IBUMean[is.na(TestBB$IBU)]
MeanABV <- BB %>% filter(!is.na(ABV)) %>% group_by(Style) %>% dplyr::summarize(ABVMean=mean(ABV))
TestBB1 <- merge(TestBB,MeanABV, by="Style")
TestBB1$ABV[is.na(TestBB1$ABV)] <- TestBB1$ABVMean[is.na(TestBB1$ABV)]
gg_miss_var(TestBB1) + ggtitle("Missing Values in Dataset")

#We determined that the missing IBU and ABV values were missing completely at random (MCAR) because we found that no other variable (State, Style, Ounces) impacted the probability of missingness. Using our domain knowledge, we learned that every style of beer has a range of IBU and ABV values. We determined that imputation would be the best method of addressing missing values as opposed to listwise deletion.In order to find the missing values in ABV and IBU, we calculated the mean of each variable by Style and imputed #them into BB dataframe.
#Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare.
TestBB %>% ggplot(aes(x = State, y = IBU, fill = State)) + geom_bar(position = "dodge", stat = "summary", fun="median") + ggtitle("Distribution of Median IBU of Breweries by State") + ylab("International Bitterness Unit") + theme(legend.position="none")

#We found the median IBU for each state and plotted a bar chart for viewing. We found that DE and WV stood out from the rest. Both DE and WV have only two APA beers with high IBU values (which explain why their medians are higher) which skew the distribution. Median ABV are less varied than median international bitter unit values There doesn't appear to be much skewness in the IBU data. It has more of a uniform distribution with exception to two states (DE and WV).
#Compute the median alcohol content for each state. Plot a bar chart to compare.
TestBB1 %>% ggplot(aes(x = State, y = ABV, fill = State)) + geom_bar(position = "dodge", stat = "summary", fun="median") + ggtitle("Distribution of Median ABV of Breweries by State") + ylab("Alcohol By Volume") + theme(legend.position="none")

#ABV has more of a uniform distribution with very little peaks. This means that there's a probability that all ABV values are equally likely across states.
#Which state has the most bitter (IBU) beer?
TestBB %>% dplyr::summarise(max(IBU))
## max(IBU)
## 1 138
TestBB %>% filter(IBU == "138")
## Style Brewery_id Name.x Beer_ID
## 1 American Double / Imperial IPA 375 Bitter Bitch Imperial IPA 980
## ABV IBU Ounces Name.y City State IBUMean
## 1 0.082 138 12 Astoria Brewing Company Astoria OR 93.32
p <- BB %>% ggplot(aes(x=IBU, fill=State)) + geom_bar() + ggtitle("Finding State with Maximum IBU") + theme(legend.position="none") + ylab("Beers")
ggplotly(p)
## Warning: Removed 1005 rows containing non-finite values (`stat_count()`).
#The state with the most bitter (IBU) beer is Oregon.
#Which state has the maximum alcoholic (ABV) beer?
TestBB1 %>% dplyr::summarise(max(ABV))
## max(ABV)
## 1 0.128
TestBB1 %>% filter(ABV == "0.128")
## Style Brewery_id
## 1 Quadrupel (Quad) 52
## Name.x Beer_ID ABV IBU Ounces
## 1 Lee Hill Series Vol. 5 - Belgian Style Quadrupel Ale 2565 0.128 24 19.2
## Name.y City State IBUMean ABVMean
## 1 Upslope Brewing Company Boulder CO 24 0.104
p <- BB %>% ggplot(aes(x=ABV, fill=State)) + geom_bar() + ggtitle("Finding State with Maximum ABV") + theme(legend.position="none") + ylab("Beers")
ggplotly(p)
## Warning: Removed 62 rows containing non-finite values (`stat_count()`).
#The state with the maximum alcoholic (ABV) beer is Colorado.
#Grab the summary statistics of ABV
summary(TestBB1$ABV)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02700 0.05000 0.05600 0.05972 0.06700 0.12800
#We found that while the measure of spread is large, many beers, found in the 50th percentile of ABV values, fall between 0.050 (Q1) and 0.067 (Q3), which is a small range. There are also many beers with ABVs beyond the third quartile (greater than 0.067)
#Distribution of ABV versus IBU
BB %>% filter(!is.na(ABV) & !is.na(IBU)) %>% select(ABV, IBU) %>% ggpairs()

BB %>% ggplot(aes(x=ABV, y=IBU)) + geom_point() +geom_smooth(position="jitter") + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 1005 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 1005 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1005 rows containing missing values (`geom_point()`).

TestBB1 %>% select(ABV, IBU) %>% ggpairs()

TestBB1 %>% ggplot(aes(x=ABV, y=IBU)) + geom_point() +geom_smooth(position="jitter") + geom_smooth() + ggtitle("Distribution of ABV versus IBU")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

#There isn't an apparent relationship. We don't have enough evidence to claim that there is a linear relationship between ABV and IBU in this dataset.There appears to be many beers with high ABV and low IBU values. If there were less of these values, the curve would be higher and would likely show a linear relationship in the scatterplot.
#We suspect specific styles are skewing this data.There are many styles of beer in this dataset, specifically 91 styles. It would be interesting to do a case study identifying which style(s) are skewing the distribution of ABV and IBU values.
#Distribution of IBU versus ABV by Style (IPA and Ale)
library(dplyr)
IPA <- TestBB1 %>% filter(grepl('IPA', Style))
Ale <- TestBB1 %>% filter(grepl('Ale', Style))
head(IPA)
## Style Brewery_id Name.x Beer_ID
## 1 American Double / Imperial IPA 167 Gordon Imperial Red (2010) 806
## 2 American Double / Imperial IPA 215 More Cowbell 2123
## 3 American Double / Imperial IPA 504 Gordon (2005) 805
## 4 American Double / Imperial IPA 167 GUBNA Imperial IPA 6
## 5 American Double / Imperial IPA 375 Bitter Bitch Imperial IPA 980
## 6 American Double / Imperial IPA 175 MechaHopzilla 1114
## ABV IBU Ounces Name.y City State
## 1 0.087 85.00 12 Oskar Blues Brewery Longmont CO
## 2 0.090 118.00 16 Buffalo Bayou Brewing Company Houston TX
## 3 0.092 85.00 12 Oskar Blues Brewery Lyons CO
## 4 0.099 100.00 12 Oskar Blues Brewery Longmont CO
## 5 0.082 138.00 12 Astoria Brewing Company Astoria OR
## 6 0.088 93.32 16 New Orleans Lager & Ale Brewing ... New Orleans LA
## IBUMean ABVMean
## 1 93.32 0.08736893
## 2 93.32 0.08736893
## 3 93.32 0.08736893
## 4 93.32 0.08736893
## 5 93.32 0.08736893
## 6 93.32 0.08736893
head(Ale)
## Style Brewery_id Name.x Beer_ID
## 1 Abbey Single Ale 58 Abbey's Single (2015- ) 2505
## 2 Abbey Single Ale 58 Abbey's Single Ale (Current) 1887
## 3 American Amber / Red Ale 387 Colorado Red Ale 220
## 4 American Amber / Red Ale 61 Reprise Centennial Red 2288
## 5 American Amber / Red Ale 487 Wisco Disco 953
## 6 American Amber / Red Ale 202 Loki Red Ale 2191
## ABV IBU Ounces Name.y City State IBUMean
## 1 0.049 22.0000 12 Destihl Brewery Bloomington IL 22.0000
## 2 0.049 22.0000 12 Destihl Brewery Bloomington IL 22.0000
## 3 0.062 36.2987 12 Revolution Brewing Paonia CO 36.2987
## 4 0.060 36.2987 12 4 Hands Brewing Company Saint Louis MO 36.2987
## 5 0.051 31.0000 16 Stillmank Beer Company Green Bay WI 36.2987
## 6 0.075 53.0000 16 Fearless Brewing Company Estacada OR 36.2987
## ABVMean
## 1 0.049000
## 2 0.049000
## 3 0.057456
## 4 0.057456
## 5 0.057456
## 6 0.057456
BBIPAAle <- TestBB1 %>% filter(str_detect(Style, "IPA")|str_detect(Style, " Ale"))
BBIPAAle$AleType <- ifelse(str_detect(BBIPAAle$Style,"IPA"),"IPA","Ale")
BBIPAAle %>% ggplot(aes(x=ABV, y = IBU, color = AleType)) + geom_point() +ggtitle("Distribution of IBU versus ABV by Style (IPA and Ale)")

#We found that IPAs have higher ABV and IBU values and Ales have lower ABV and IBU values.
#Internal classification knn
classifications = knn.cv(BBIPAAle[,c(5,6)],BBIPAAle$AleType, prob = TRUE, k = 5)
table(classifications,BBIPAAle$AleType)
##
## classifications Ale IPA
## Ale 894 66
## IPA 68 505
CM = confusionMatrix(table(classifications,BBIPAAle$AleType))
CM
## Confusion Matrix and Statistics
##
##
## classifications Ale IPA
## Ale 894 66
## IPA 68 505
##
## Accuracy : 0.9126
## 95% CI : (0.8973, 0.9263)
## No Information Rate : 0.6275
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8131
##
## Mcnemar's Test P-Value : 0.9312
##
## Sensitivity : 0.9293
## Specificity : 0.8844
## Pos Pred Value : 0.9313
## Neg Pred Value : 0.8813
## Prevalence : 0.6275
## Detection Rate : 0.5832
## Detection Prevalence : 0.6262
## Balanced Accuracy : 0.9069
##
## 'Positive' Class : Ale
##
#To test our hypothesis in the previous plot, we conducted an internal KNN test. Note this was not included in the comparison slide in the presentation.
#Naive Bayes
splitPerc = .7 #Training / Test split Percentage
trainI = sample(seq(1:length(BBIPAAle$IBU)),round(.7*length(BBIPAAle$IBU)))
train = BBIPAAle[trainI,]
test = BBIPAAle[-trainI,]
model = naiveBayes(train[,c(5,6)],train$AleType)
table(predict(model,test[,c(5,6)]),test$AleType)
##
## Ale IPA
## Ale 259 18
## IPA 37 146
CM = confusionMatrix(table(predict(model,test[,c(5,6)]),test$AleType))
CM
## Confusion Matrix and Statistics
##
##
## Ale IPA
## Ale 259 18
## IPA 37 146
##
## Accuracy : 0.8804
## 95% CI : (0.8472, 0.9086)
## No Information Rate : 0.6435
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.746
##
## Mcnemar's Test P-Value : 0.01522
##
## Sensitivity : 0.8750
## Specificity : 0.8902
## Pos Pred Value : 0.9350
## Neg Pred Value : 0.7978
## Prevalence : 0.6435
## Detection Rate : 0.5630
## Detection Prevalence : 0.6022
## Balanced Accuracy : 0.8826
##
## 'Positive' Class : Ale
##
#To test our hypothesis in the previous plot, we conducted a Naive Bayes test (train and test sets split 70 and 30 percent).
#External classification knn (train and test)
splitPerc = .7 #Training / Test split Percentage
trainI = sample(seq(1:length(BBIPAAle$IBU)),round(.7*length(BBIPAAle$IBU)))
train = BBIPAAle[trainI,]
test = BBIPAAle[-trainI,]
model = naiveBayes(train[,c(5,6)],train$AleType)
table(predict(model,test[,c(5,6)]),test$AleType)
##
## Ale IPA
## Ale 265 35
## IPA 19 141
CM = confusionMatrix(table(predict(model,test[,c(5,6)]),test$AleType))
CM
## Confusion Matrix and Statistics
##
##
## Ale IPA
## Ale 265 35
## IPA 19 141
##
## Accuracy : 0.8826
## 95% CI : (0.8496, 0.9106)
## No Information Rate : 0.6174
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.7471
##
## Mcnemar's Test P-Value : 0.04123
##
## Sensitivity : 0.9331
## Specificity : 0.8011
## Pos Pred Value : 0.8833
## Neg Pred Value : 0.8813
## Prevalence : 0.6174
## Detection Rate : 0.5761
## Detection Prevalence : 0.6522
## Balanced Accuracy : 0.8671
##
## 'Positive' Class : Ale
##
#To test our hypothesis in the previous plot, we conducted a KNN test (external test with train and test sets split 70 and 30 percent) and compared it to the Naive Bayes test.
#We used both K-NN and Naïve Bayes models to get a better sense of the classification and accuracy. KNN gave a higher accuracy and sensitivity while Naïve Bayes gave a greater specificity. Using test and training sets of 70 and 30 percent produced the results in table 2. We see that Naïve Bayes produced more accurate classifications of Ales and IPAs and thus proved to be a better fit for this classification test.
#Separate dataset by beer style (IPA, Ale, Lager) and create bar chart.
BBClassify <- TestBB1
BBClassify$BeerType = BBClassify$AleType
BBClassify$BeerType = ifelse(str_detect(BBClassify$Style, "IPA"),"IPA",ifelse(str_detect(BBClassify$Style, "Stout"), "Stout",ifelse(str_detect(BBClassify$Style, "Pilsner"),"Pilsner",ifelse(str_detect(BBClassify$Style, "Beer"),"Beer",ifelse(str_detect(BBClassify$Style, " Ale"),"Ale",ifelse(str_detect(BBClassify$Style, "Lager"),"Lager","Other"))))))
BBClassify %>% filter(BeerType == "IPA" |BeerType == "Ale" |BeerType == "Lager") %>% ggplot(aes(x=IBU, y = ABV, color = BeerType)) + geom_jitter() + ggtitle("Distribution of IBU versus ABV by Style (IPA, Ale, Lager)")

BBClassify %>% ggplot(aes(x=BeerType, fill= BeerType)) + geom_bar() + ggtitle("Distribution of Beers by Beer Type") + ylab("Beers") + xlab("Beer Type")

#Interested in understanding the distribution of IBUs and ABVs amongst beer styles, we thought to compare Budweiser’s beer style (lager) with IPAs and Ales. We found that IPAs represented beers with higher IBUs and ABVs and Lagers with lower IBUs and ABVs. Ales appeared to have lower values as well with more outliers with higher values.
#We know that Budweiser is one of the foremost beer brands in the U.S. and that Budweisers customers have depended on the company to provide a great lager option since 1876. We are also aware of Budweisers recent rollout of beverages that meet the needs of audiences interested in trending beverages such as seltzers and nonalcoholic beers. Like the Bud Light Seltzer and Bud Zero, Budweiser could stand to introduce an option for those looking for high ABV and IBU IPAs from a dependable, trusted brand.